# =============================================================================
# Calculate the population weighted centroid of each zip code.
# =============================================================================

def isnan2(x):
    try:
        a=np.isnan(x)
    except TypeError:
        a=False
    return a

def downsize(df,types):
#    print(types)
    for var in list(set(list(df)) & set(list(types))):
#        print(var)
        if not isnan2(types[var]):
            if types[var]=='float':
                df[var]=pd.to_numeric(df[var],errors='coerce')
            elif types[var]=='integer':
                df[var]=pd.to_numeric(df[var],errors='coerce')
            elif types[var]=='cat':
                df[var]=df[var].astype('category')
            elif types[var]=='date':
                df[var]=pd.to_datetime(df[var], format='%Y-%m-%d',errors='coerce')
            elif types[var]=='date_safe':
                #Create a dictionary of the date values and then use replace to adjust the column
                try:
                    temp={d:dt.datetime.fromtimestamp(d, dt.timezone.utc) for d in df[var].unique()}
                except:
                    pass
                try:
                    temp={d:dt.datetime.fromtimestamp(d/1000, tz=dt.timezone.utc) for d in df[var].unique()}
                except:
                    continue
                df[var]=df[var].replace(temp)
    return df

#Normally I would calculate the distance with geopandas's distance formula and projection epsg3310 or the distance function from geopy. BUT The projection system in python is Broken
#       So here is the haversine formula for distance
def haversine(coord1,coord2):
    """
    Function accepts the points as tuples (geometry objects) and returns a very rough distance measure.
    """
    R = 6372800  # Earth radius in meters
    lat1, lon1 = coord1
    lat2, lon2 = coord2
    
    phi1, phi2 = math.radians(lat1), math.radians(lat2) 
    dphi       = math.radians(lat2 - lat1)
    dlambda    = math.radians(lon2 - lon1)
    
    a = math.sin(dphi/2)**2 + \
        math.cos(phi1)*math.cos(phi2)*math.sin(dlambda/2)**2
    
    return 2*R*math.atan2(math.sqrt(a), math.sqrt(1 - a))


def wavg(group, avg_name, weight_name):
    d = group[avg_name]
    w = group[weight_name]
    try:
        return (d * w).sum() / w.sum()
    except ZeroDivisionError:
        return d.mean()

# =============================================================================
# Import the State shapefiles
# =============================================================================
st_shp = gpd.read_file(cdd['p_d_geo_shp_tiger']+r'/STATE_2019/tl_2019_us_state.zip')
st_shp = st_shp.to_crs('epsg:6933')

#Import the Zip code data
zips = gpd.read_file(cdd['p_d_geo_shp_tiger']+r'/ZCTA5_2019/tl_2019_us_zcta510.zip')
zips = zips.to_crs('epsg:6933')


# =============================================================================
# Import the CBG shapefiles that include demographic data.
# =============================================================================
state_codes = {'PR': '72', 'WA': '53', 'DE': '10', 'DC': '11', 'WI': '55', 'WV': '54', 'HI': '15','FL': '12', 'WY': '56',
               'NJ': '34', 'NM': '35', 'TX': '48','LA': '22', 'NC': '37', 'ND': '38', 'NE': '31', 'TN': '47', 'NY': '36',
                'PA': '42', 'AK': '02', 'NV': '32', 'NH': '33', 'VA': '51', 'CO': '08','CA': '06', 'AL': '01', 'AR': '05',
                'VT': '50', 'IL': '17', 'GA': '13','IN': '18', 'IA': '19', 'MA': '25', 'AZ': '04', 'ID': '16', 'CT': '09',
                'ME': '23', 'MD': '24', 'OK': '40', 'OH': '39', 'UT': '49', 'MO': '29','MN': '27', 'MI': '26', 'RI': '44',
                'KS': '20', 'MT': '30', 'MS': '28','SC': '45', 'KY': '21', 'OR': '41', 'SD': '46', "AS": "60", "MP": "69","VI": "78", "GU": "66"}
#Import all of the shapefiles and extract only the centroid and populations
def pblocks(state):
    file=r'/tabblock2010_'+state+'_pophu.zip'
    try:
        a = gpd.read_file(cdd['p_d_geo_demo']+file)
        return a[['STATEFP10','COUNTYFP10','TRACTCE10','BLOCKCE','BLOCKID10','PARTFLG','HOUSING10','POP10','geometry']]
    except:
        return None
blocks = Parallel(n_jobs=15, prefer="threads")(delayed(pblocks)(state=s) for s in tqdm(list(state_codes.values())))
blocks = pd.concat(blocks,ignore_index=True,sort=False)
blocks.rename(columns={'STATEFP10':'fips_s', 'COUNTYFP10':'fips_c', 'TRACTCE10':'tract', 'BLOCKCE':'blockg', 'BLOCKID10':'bid', 'PARTFLG':'pt_flg', 'HOUSING10':'housing', 'POP10':'pop'},inplace=True)
blocks = blocks.to_crs('epsg:6933')
blocks['centroid'] = blocks.geometry.centroid
blocks.set_geometry('centroid',inplace=True)
blocks.to_pickle(cdd['p_d_geo_demo'] + r'\block_points.pkl')
blocks = pd.read_pickle(cdd['p_d_geo_demo'] + r'\block_points.pkl')


# =============================================================================
# Now find the area of all block groups in the USA. To find the area of a shape you have to use the correct projection
#   Use proj=6933 for areas. The default projection for US shapefiles is 4269. 

#Merge the zip code and block level data so I know which block centroids are in each zip code.
#       Note reprojection is fortunately not necessary (https://gis.stackexchange.com/questions/170839/is-re-projection-needed-from-srid-4326-wgs-84-to-4269-nad-83srid/170854)
bzs = gpd.sjoin(blocks, zips, how="inner", op='within')
bzs.crs
bzs = bzs.to_crs('epsg:6933')
bzs['lat']=bzs.centroid.y
bzs['long']=bzs.centroid.x
wlat = bzs.groupby('ZCTA5CE10').apply(wavg,'lat','pop').reset_index().rename(columns={0:'wlat'})
wlong = bzs.groupby('ZCTA5CE10').apply(wavg,'long','pop').reset_index().rename(columns={0:'wlong'})
zpop = bzs.groupby('ZCTA5CE10')['pop'].sum().reset_index().rename(columns={0:'pop_zip'})
centroids = pd.merge(wlong,wlat,how='left',on='ZCTA5CE10')
centroids = pd.merge(centroids,zpop,how='left',on='ZCTA5CE10')
centroids.to_pickle(cdd['p_d_geo_demo'] + r'\centroids.pkl')


#Merge the results back into a zipcode file
zipdata = pd.merge(zips,centroids,how='left',on='ZCTA5CE10')
zipdata['lat'] = zipdata.geometry.centroid.y
zipdata['long'] = zipdata.geometry.centroid.x
zipdata['density'] = (zipdata['pop'] / zipdata['ALAND10'])*1000000
zipdata.to_pickle(cdd['p_d_geo_demo'] + r'\zip_pop.pkl')
zipdata.drop(columns=['geometry']).to_excel(cdd['p_d_geo_demo'] + r'\zip_pop.xlsx')




